# Read in main data set
data <- read.csv("raw/schma19962016.csv") %>%
clean_names()
addon <- read_csv("raw/stchso20052016.csv") %>%
clean_names()
## Parsed with column specification:
## cols(
## .default = col_logical(),
## DBN = col_character(),
## SCHNAM = col_character(),
## YEAR = col_double(),
## XDBN = col_character(),
## XDBN_FLAG = col_double(),
## BOROUGH = col_character(),
## DISTRICT = col_character(),
## BN = col_character(),
## BNLONG_NUMDIST = col_double(),
## BNLONG_BNDUP = col_double(),
## BNLONG_DISTDUP = col_double(),
## BNLONG = col_character(),
## BNLONG_FIXED = col_double(),
## BNLONG_YRFIRST = col_double(),
## BNLONG_YRLAST = col_double(),
## BNLONG_TOTALYRS = col_double(),
## BNLONG_PHASEDOUT19962016 = col_double(),
## BNLONG_PHASEDIN19962016 = col_double(),
## BNLONG_CONTOPERATE19962016 = col_double(),
## HSOSCHNAM = col_character()
## # ... with 31 more columns
## )
## See spec(...) for full column specifications.
## Warning: 1559679 parsing failures.
## row col expected actual file
## 1447 STCELANUMTSTG03 1/0/T/F/TRUE/FALSE 37 'raw/stchso20052016.csv'
## 1447 STCELANUMTSTG04 1/0/T/F/TRUE/FALSE 46 'raw/stchso20052016.csv'
## 1447 STCELANUMTSTG05 1/0/T/F/TRUE/FALSE 28 'raw/stchso20052016.csv'
## 1447 STCELANUMTSTG06 1/0/T/F/TRUE/FALSE 36 'raw/stchso20052016.csv'
## 1447 STCELANUMTSTTOT 1/0/T/F/TRUE/FALSE 147 'raw/stchso20052016.csv'
## .... ............... .................. ...... ........................
## See problems(...) for more details.
#Load in collected data about new and failing schools
old_new_schools <- read_excel("raw/oldnewssc.xlsx")
# Create a list of schools so that I can see how they are named in this data
schools <- data %>%
select(schnam) %>%
distinct(schnam) %>%
arrange(schnam)
# Create a list of NYC's specialized schools
selective_schools <- c("BRONX HIGH SCHOOL OF SCIENCE", "BROOKLYN TECHNICAL HIGH SCHOOL", "STATEN ISLAND TECHNICAL HS", "STUYVESANT HIGH SCHOOL", "BROOKLYN LATIN, THE", "QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK COLLEG", "HIGH SCHOOL FOR MATHEMATICS, SCIENCE AND ENGINEERI", "HIGH SCHOOL OF AMERICAN STUDIES AT LEHMAN COLLEGE")
# Create a dataset that filters out observations for selective schools
data_selective_schools <- data %>%
filter(schnam %in% selective_schools)
## Parsed with column specification:
## cols(
## .default = col_double(),
## DBN = col_character(),
## Name = col_character()
## )
## See spec(...) for full column specifications.
## Warning: 6 parsing failures.
## row col expected actual file
## 1124 fl_percent a double n/a 'raw/enrollmentdata/2006_-_2012_School_Demographics_and_Accountability_Snapshot.csv'
## 2154 fl_percent a double n/a 'raw/enrollmentdata/2006_-_2012_School_Demographics_and_Accountability_Snapshot.csv'
## 2161 fl_percent a double n/a 'raw/enrollmentdata/2006_-_2012_School_Demographics_and_Accountability_Snapshot.csv'
## 4299 fl_percent a double n/a 'raw/enrollmentdata/2006_-_2012_School_Demographics_and_Accountability_Snapshot.csv'
## 8523 fl_percent a double n/a 'raw/enrollmentdata/2006_-_2012_School_Demographics_and_Accountability_Snapshot.csv'
## .... .......... ........ ...... ....................................................................................
## See problems(...) for more details.
## # A tibble: 2,322 x 39
## dbn name schoolyear fl_percent frl_percent total_enrollment prek k
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 01M2… HENR… 2008 99.5 NA 515 NA NA
## 2 01M2… HENR… 2009 95.3 NA 470 NA NA
## 3 01M2… HENR… 2010 NA 69.9 511 NA NA
## 4 01M2… HENR… 2011 NA 73.3 448 NA NA
## 5 01M2… HENR… 2012 NA 88.6 422 NA NA
## 6 01M4… UNIV… 2006 71.5 NA 478 NA NA
## 7 01M4… UNIV… 2007 71.5 NA 533 NA NA
## 8 01M4… UNIV… 2008 71.5 NA 588 NA NA
## 9 01M4… UNIV… 2009 60.9 NA 586 NA NA
## 10 01M4… UNIV… 2010 NA 73.4 532 NA NA
## # … with 2,312 more rows, and 31 more variables: grade1 <dbl>, grade2 <dbl>,
## # grade3 <dbl>, grade4 <dbl>, grade5 <dbl>, grade6 <dbl>, grade7 <dbl>,
## # grade8 <dbl>, grade9 <dbl>, grade10 <dbl>, grade11 <dbl>, grade12 <dbl>,
## # ell_num <dbl>, ell_percent <dbl>, sped_num <dbl>, sped_percent <dbl>,
## # ctt_num <dbl>, selfcontained_num <dbl>, asian_num <dbl>, asian_per <dbl>,
## # black_num <dbl>, black_per <dbl>, hispanic_num <dbl>, hispanic_per <dbl>,
## # white_num <dbl>, white_per <dbl>, male_num <dbl>, male_per <dbl>,
## # female_num <dbl>, female_per <dbl>, total_hs_enrollment <dbl>
## Parsed with column specification:
## cols(
## .default = col_double(),
## DBN = col_character(),
## `School Name` = col_character(),
## Year = col_character(),
## `% Female` = col_character(),
## `% Male` = col_character(),
## `% Asian` = col_character(),
## `% Black` = col_character(),
## `% Hispanic` = col_character(),
## `% Other` = col_character(),
## `% White` = col_character(),
## `% Students with Disabilities` = col_character(),
## `% English Language Learners` = col_character(),
## `% Poverty` = col_character()
## )
## See spec(...) for full column specifications.
# Filter for high schools (defined as schools serving 9-12 at least)
data_hs <- data %>%
filter(enrnumg12 != 0 & enrnumg11 != 0 & enrnumg10 != 0 & enrnumg09 != 0 ) %>%
group_by(year) %>%
mutate(percent_rank = percent_rank(hsopcthsgtot))
# Plot all schools by year and graduation rates
data_hs %>%
filter(hsopcthsgtot != is.na(hsopcthsgtot)) %>%
select(schnam, year, hsopcthsgtot) %>%
ggplot(aes(x = year, y = hsopcthsgtot, color = schnam)) +
geom_jitter() +
theme(legend.position = "none")

# Plot percentile rank of school over time
data_hs %>%
filter(schnam %in% selective_schools) %>%
ggplot(aes(x= year, y = percent_rank, color = schnam)) +
geom_line() +
theme(legend.position = "none")
## Warning: Removed 79 rows containing missing values (geom_path).

# Find out valid observations of graduation rates per year
data %>%
group_by(year) %>%
filter(hsopctdchtot != is.na(hsopcthsgtot)) %>%
select(schnam, year, hsopcthsgtot) %>%
count()
## # A tibble: 9 x 2
## # Groups: year [9]
## year n
## <int> <int>
## 1 2000 4
## 2 2001 6
## 3 2002 55
## 4 2003 57
## 5 2004 214
## 6 2005 213
## 7 2006 223
## 8 2007 276
## 9 2008 329

# Join the three data sets
# Because demo2006-2016 is missing 2005 data. I pulled those numbers from SCHMA
# demo2006-2016 has more reliable data, excpt for 2005
data_join <- data_hs %>%
full_join(grad_rates_clean, by = c("dbn", "year" = "grad_year")) %>%
full_join(demo_20062016, by = c("dbn", "year" = "schoolyear")) %>%
mutate(total_hs_enrollment_schnam = enrnumg09 + enrnumg10 + enrnumg11 + enrnumg12) %>%
mutate(total_hs_enrollment = coalesce(total_hs_enrollment, as.double(total_hs_enrollment_schnam)))
## Warning: Column `dbn` joining factor and character vector, coercing into
## character vector
# tool to test commands
#data_join %>%
# select(dbn, schnam, year, total_hs_enrollment, total_hs_enrollment_schnam) %>%
# arrange(dbn, year) %>%
# mutate(total_hs_enrollment = coalesce(total_hs_enrollment, as.double(total_hs_enrollment_schnam)))
## Adding missing grouping variables: `year`
## Warning: Removed 189 rows containing missing values (geom_path).

## Warning: Removed 689 rows containing missing values (geom_path).



## Adding missing grouping variables: `year`
## Warning: `cols` is now required.
## Please use `cols = c(data)`
## Warning: Removed 23 rows containing non-finite values (stat_smooth).
## Warning: Removed 23 rows containing missing values (geom_point).

# Plot a line graph showing the relationship between year and graduation rates
# per type of school
replacement_schools_grad_vs_failing <- data_join %>%
filter(dbn %in% replacement_schools |
dbn %in% large_failing_schools |
dbn %in% failing_2005_large_open_shrank_list |
dbn %in% failing_2005_large_closed_list |
dbn %in% failing_2005_large_open_2015_list) %>%
# Did this to see if there was any pre-2005 data on large failing schools but seems like there isn't
#mutate(total_grads_percent_of_cohort = coalesce(total_grads_percent_of_cohort, hsopcthsgnyc/100)) %>%
select(dbn, year, total_grads_percent_of_cohort) %>%
arrange(dbn, year) %>%
mutate(data_cat = ifelse(dbn %in% large_failing_schools,
"Announced for Closure in 2005",
ifelse(dbn %in% replacement_schools,
"Small Replacement Schools opened by 2005",
ifelse(dbn %in% failing_2005_large_closed_list,
"Large Eventually Closed",
ifelse(dbn %in% failing_2005_large_open_shrank_list,
"Large and Shrank",
"Large and Operating"))))) %>%
# Picked 2008 as an arbitrary date, just to show what graduation rates were like
# for schools that were closed
filter(!dbn %in% large_failing_schools | year <= 2008) %>%
# Started at '06 because 2005 had a small sample size of all the schools
# Data was also kinda wonky but not sure if that's a good reason for excluding
filter(!dbn %in% replacement_schools | year >= 2006)
# Plot it
replacement_schools_grad_vs_failing %>%
ggplot(aes(x = year, y = total_grads_percent_of_cohort, color = data_cat)) +
geom_smooth(method = "lm", se = FALSE)
## Warning: Removed 493 rows containing non-finite values (stat_smooth).

replacement_schools_grad_vs_failing %>%
group_by(year, data_cat) %>%
summarize(avg_grad = mean(total_grads_percent_of_cohort, na.rm =TRUE)) %>%
ggplot(aes(x = year, y = avg_grad, color = data_cat)) +
geom_point() +
geom_line() +
xlim(2005, 2015) +
ylim(0.25, 0.8)
## Warning: Removed 39 rows containing missing values (geom_point).
## Warning: Removed 39 rows containing missing values (geom_path).

data_join %>%
filter(dbn == "29Q272") %>%
select(dbn, school, total_grads_percent_of_cohort, hsonumcohtot, hsonumcohnyc, hsonumhsgtot)
## Adding missing grouping variables: `year`
## # A tibble: 12 x 7
## # Groups: year [12]
## year dbn school total_grads_per… hsonumcohtot hsonumcohnyc hsonumhsgtot
## <dbl> <chr> <chr> <dbl> <int> <int> <int>
## 1 2005 29Q272 GEORGE … 1 24 21 21
## 2 2006 29Q272 GEORGE … 0.737 43 39 28
## 3 2007 29Q272 GEORGE … 0.589 127 97 54
## 4 2008 29Q272 GEORGE … 0.651 97 65 42
## 5 2009 29Q272 GEORGE … 0.766 NA 79 60
## 6 2010 29Q272 GEORGE … 0.853 NA 103 88
## 7 2011 29Q272 GEORGE … 0.826 NA 111 91
## 8 2012 29Q272 GEORGE … 0.769 NA 110 84
## 9 2013 29Q272 GEORGE … 0.826 NA 95 76
## 10 2014 29Q272 GEORGE … 0.838 NA 112 94
## 11 2015 29Q272 GEORGE … 0.759 NA 137 101
## 12 2016 29Q272 <NA> NA NA NA NA

# Create dataset that describes 20,50,80 percentiles of regents/cohort per year
grad_rates_regents_cohort <- grad_rates_clean %>%
group_by(cohort_year) %>%
nest() %>%
mutate(percentile_20 = map_dbl(data, ~ quantile(.[[9]], probs = .2)),
percentile_80 = map_dbl(data, ~ quantile(.[[9]], probs = .8)),
percentile_50 = map_dbl(data, ~ quantile(.[[9]], probs = .5))) %>%
arrange(cohort_year) %>%
select(-data)
#Graph the median result with errorbars at 80 and 20
grad_rates_regents_cohort %>%
ggplot(aes(x = cohort_year, y = percentile_50)) +
geom_point() +
geom_errorbar(aes(ymin = percentile_20, ymax = percentile_80))

# Create dataset that describes 20,50,80 percentiles of regents/grads per year
grad_rates_regents_grads <- grad_rates_clean %>%
group_by(cohort_year) %>%
nest() %>%
mutate(percentile_20 = map_dbl(data, ~ quantile(.[[10]], probs = .2, na.rm = TRUE)),
percentile_80 = map_dbl(data, ~ quantile(.[[10]], probs = .8, na.rm = TRUE)),
percentile_50 = map_dbl(data, ~ quantile(.[[10]], probs = .5, na.rm = TRUE))) %>%
arrange(cohort_year) %>%
select(-data)
#Graph the median result with errorbars at 80 and 20
grad_rates_regents_grads %>%
ggplot(aes(x = cohort_year, y = percentile_50)) +
geom_point() +
geom_errorbar(aes(ymin = percentile_20, ymax = percentile_80))

# Create dataset that describes 20,50,80 percentiles of adv regents/cohort
grad_rates_advregents_cohort <- grad_rates_clean %>%
group_by(cohort_year) %>%
nest() %>%
mutate(percentile_20 = map_dbl(data, ~ quantile(.[[13]], probs = .2, na.rm = TRUE)),
percentile_80 = map_dbl(data, ~ quantile(.[[13]], probs = .8, na.rm = TRUE)),
percentile_50 = map_dbl(data, ~ quantile(.[[13]], probs = .5, na.rm = TRUE))) %>%
arrange(cohort_year) %>%
select(-data)
#Graph the median result with errorbars at 80 and 20
grad_rates_advregents_cohort %>%
ggplot(aes(x = cohort_year, y = percentile_50)) +
geom_point() +
geom_errorbar(aes(ymin = percentile_20, ymax = percentile_80))

## # A tibble: 11 x 2
## # Groups: cohort_year [11]
## cohort_year n
## <dbl> <int>
## 1 2001 214
## 2 2002 246
## 3 2003 276
## 4 2004 320
## 5 2005 353
## 6 2006 367
## 7 2007 383
## 8 2008 404
## 9 2009 419
## 10 2010 431
## 11 2011 441